LINEAR REGRESSION:

In [44]:
import warnings
warnings.filterwarnings("ignore")
import time
import random
from math import *
import operator
import pandas as pd
import numpy as np
from scipy import stats

# import plotting libraries
import matplotlib
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
%matplotlib inline 

import seaborn as sns
sns.set(style="white", color_codes=True)
sns.set(font_scale=1.5)

# load make_blobs to simulate data
from sklearn.datasets import make_blobs
from sklearn.datasets import make_classification

# import the ML algorithm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import explained_variance_score
from sklearn.preprocessing import PolynomialFeatures
from statsmodels.tools.eval_measures import rmse
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from sklearn import metrics
import statsmodels.api as sm
import statsmodels
import statsmodels.formula.api as smf
import os
import statistics


# pre-processing:
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing.data import QuantileTransformer
from sklearn.preprocessing import Imputer

# import libraries for model validation
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# import libraries for metrics and reporting
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
import xgboost
import math
from scipy.stats import pearsonr
from sklearn.model_selection import GridSearchCV
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import mean_squared_error
from pandas.plotting import scatter_matrix
In [17]:
df_house=pd.read_csv("C://Users//disoj//Desktop//Data Science Class//Simplilearn//SL Machine Learning Classes//Machine Learning Project solutions//house_price.csv")
In [18]:
df_house.shape
Out[18]:
(20640, 10)
In [19]:
df_house.head()
Out[19]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income ocean_proximity median_house_value
0 -122.23 37.88 41 880 129.0 322 126 8.3252 NEAR BAY 452600
1 -122.22 37.86 21 7099 1106.0 2401 1138 8.3014 NEAR BAY 358500
2 -122.24 37.85 52 1467 190.0 496 177 7.2574 NEAR BAY 352100
3 -122.25 37.85 52 1274 235.0 558 219 5.6431 NEAR BAY 341300
4 -122.25 37.85 52 1627 280.0 565 259 3.8462 NEAR BAY 342200
In [20]:
df_house.describe()
Out[20]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value
count 20640.000000 20640.000000 20640.000000 20640.000000 20433.000000 20640.000000 20640.000000 20640.000000 20640.000000
mean -119.569704 35.631861 28.639486 2635.763081 537.870553 1425.476744 499.539680 3.870671 206855.816909
std 2.003532 2.135952 12.585558 2181.615252 421.385070 1132.462122 382.329753 1.899822 115395.615874
min -124.350000 32.540000 1.000000 2.000000 1.000000 3.000000 1.000000 0.499900 14999.000000
25% -121.800000 33.930000 18.000000 1447.750000 296.000000 787.000000 280.000000 2.563400 119600.000000
50% -118.490000 34.260000 29.000000 2127.000000 435.000000 1166.000000 409.000000 3.534800 179700.000000
75% -118.010000 37.710000 37.000000 3148.000000 647.000000 1725.000000 605.000000 4.743250 264725.000000
max -114.310000 41.950000 52.000000 39320.000000 6445.000000 35682.000000 6082.000000 15.000100 500001.000000
In [21]:
# Checking missing values
df_house.isnull().sum()
Out[21]:
longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
ocean_proximity         0
median_house_value      0
dtype: int64
In [46]:
mode=statistics.mode(df_house['total_bedrooms'])
In [47]:
mode
Out[47]:
280.0
In [23]:
# Filling missing 'total_bedrooms' values with mode since mean doesn't make sense for total bedrooms:
df_house['total_bedrooms']=df_house['total_bedrooms'].fillna(df_house['total_bedrooms'].mode()[0])
In [24]:
# Again Checking missing values
df_house.isnull().sum()
Out[24]:
longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
ocean_proximity       0
median_house_value    0
dtype: int64
In [25]:
df_house.dtypes
Out[25]:
longitude             float64
latitude              float64
housing_median_age      int64
total_rooms             int64
total_bedrooms        float64
population              int64
households              int64
median_income         float64
ocean_proximity        object
median_house_value      int64
dtype: object
In [26]:
df_house['ocean_proximity'].unique()
Out[26]:
array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)
In [27]:
df_house["ocean_proximity"] = df_house["ocean_proximity"].astype('category')

df_house.dtypes
Out[27]:
longitude              float64
latitude               float64
housing_median_age       int64
total_rooms              int64
total_bedrooms         float64
population               int64
households               int64
median_income          float64
ocean_proximity       category
median_house_value       int64
dtype: object
In [28]:
df_nhouse = pd.get_dummies(df_house, prefix_sep='-',drop_first=True,columns=["ocean_proximity"])
In [29]:
df_nhouse.head()
Out[29]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity-INLAND ocean_proximity-ISLAND ocean_proximity-NEAR BAY ocean_proximity-NEAR OCEAN
0 -122.23 37.88 41 880 129.0 322 126 8.3252 452600 0 0 1 0
1 -122.22 37.86 21 7099 1106.0 2401 1138 8.3014 358500 0 0 1 0
2 -122.24 37.85 52 1467 190.0 496 177 7.2574 352100 0 0 1 0
3 -122.25 37.85 52 1274 235.0 558 219 5.6431 341300 0 0 1 0
4 -122.25 37.85 52 1627 280.0 565 259 3.8462 342200 0 0 1 0
In [30]:
df_nhouse.dtypes
Out[30]:
longitude                     float64
latitude                      float64
housing_median_age              int64
total_rooms                     int64
total_bedrooms                float64
population                      int64
households                      int64
median_income                 float64
median_house_value              int64
ocean_proximity-INLAND          uint8
ocean_proximity-ISLAND          uint8
ocean_proximity-NEAR BAY        uint8
ocean_proximity-NEAR OCEAN      uint8
dtype: object
In [31]:
# Lets apply standard Scaler in this case,unlike in Linear Regression
scaler=StandardScaler(copy=True,with_mean=True,with_std=True
                     ).fit(df_nhouse)
rescaled_dfnhouse=scaler.transform(df_nhouse)
In [32]:
colnames = ['longitude','latitude','housing_median_age','total_rooms','total_bedrooms','population','households','median_income','median_house_price','ocean_prox_INLAND','ocean_prox_ISLAND','ocean_prox_NEAR BAY','ocean_prox_NEAR OCEAN']
df_newh= pd.DataFrame(rescaled_dfnhouse, columns=colnames)
In [33]:
df_newh.head()
Out[33]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_price ocean_prox_INLAND ocean_prox_ISLAND ocean_prox_NEAR BAY ocean_prox_NEAR OCEAN
0 -1.327835 1.052548 0.982143 -0.804819 -0.967245 -0.974429 -0.977033 2.344766 2.129631 -0.681889 -0.015566 2.830742 -0.384466
1 -1.322844 1.043185 -0.607019 2.045890 1.358707 0.861439 1.669961 2.332238 1.314156 -0.681889 -0.015566 2.830742 -0.384466
2 -1.332827 1.038503 1.856182 -0.535746 -0.822021 -0.820777 -0.843637 1.782699 1.258693 -0.681889 -0.015566 2.830742 -0.384466
3 -1.337818 1.038503 1.856182 -0.624215 -0.714889 -0.766028 -0.733781 0.932968 1.165100 -0.681889 -0.015566 2.830742 -0.384466
4 -1.337818 1.038503 1.856182 -0.462404 -0.607758 -0.759847 -0.629157 -0.012881 1.172900 -0.681889 -0.015566 2.830742 -0.384466
In [34]:
## Correlations:
df_newh.corr()
Out[34]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_price ocean_prox_INLAND ocean_prox_ISLAND ocean_prox_NEAR BAY ocean_prox_NEAR OCEAN
longitude 1.000000 -0.924664 -0.108197 0.044568 0.068831 0.099773 0.055310 -0.015176 -0.045967 -0.055575 0.009446 -0.474489 0.045509
latitude -0.924664 1.000000 0.011173 -0.036100 -0.066147 -0.108785 -0.071035 -0.079809 -0.144160 0.351166 -0.016572 0.358771 -0.160818
housing_median_age -0.108197 0.011173 1.000000 -0.361262 -0.318710 -0.296244 -0.302916 -0.119034 0.105623 -0.236645 0.017020 0.255172 0.021622
total_rooms 0.044568 -0.036100 -0.361262 1.000000 0.925723 0.857126 0.918484 0.198050 0.134153 0.025624 -0.007572 -0.023022 -0.009175
total_bedrooms 0.068831 -0.066147 -0.318710 0.925723 1.000000 0.871989 0.972731 -0.007511 0.049406 -0.005737 -0.004257 -0.019467 0.000377
population 0.099773 -0.108785 -0.296244 0.857126 0.871989 1.000000 0.907222 0.004834 -0.024650 -0.020732 -0.010412 -0.060880 -0.024264
households 0.055310 -0.071035 -0.302916 0.918484 0.972731 0.907222 1.000000 0.013033 0.065843 -0.039402 -0.009077 -0.010093 0.001714
median_income -0.015176 -0.079809 -0.119034 0.198050 -0.007511 0.004834 0.013033 1.000000 0.688075 -0.237496 -0.009228 0.056197 0.027344
median_house_price -0.045967 -0.144160 0.105623 0.134153 0.049406 -0.024650 0.065843 0.688075 1.000000 -0.484859 0.023416 0.160284 0.141862
ocean_prox_INLAND -0.055575 0.351166 -0.236645 0.025624 -0.005737 -0.020732 -0.039402 -0.237496 -0.484859 1.000000 -0.010614 -0.240887 -0.262163
ocean_prox_ISLAND 0.009446 -0.016572 0.017020 -0.007572 -0.004257 -0.010412 -0.009077 -0.009228 0.023416 -0.010614 1.000000 -0.005499 -0.005985
ocean_prox_NEAR BAY -0.474489 0.358771 0.255172 -0.023022 -0.019467 -0.060880 -0.010093 0.056197 0.160284 -0.240887 -0.005499 1.000000 -0.135818
ocean_prox_NEAR OCEAN 0.045509 -0.160818 0.021622 -0.009175 0.000377 -0.024264 0.001714 0.027344 0.141862 -0.262163 -0.005985 -0.135818 1.000000
In [35]:
# Generating the correlation heat-map
corrmat=df_newh.corr()
top_corr_features=corrmat.index
plt.figure(figsize=(20,20))
sns.heatmap( df_newh[top_corr_features].corr(),annot=True,cmap="RdYlGn");
In [36]:
# Histograms:
df_newh.hist(figsize=(15,15), xlabelsize = 10);
In [37]:
# visualize the relationship between the features and the response using scatterplots
sns.pairplot(df_newh, diag_kind='kde',plot_kws={'alpha':0.6,'s':80,'edgecolor':'k'},size=4)
Out[37]:
<seaborn.axisgrid.PairGrid at 0x236fac51e80>
In [38]:
# extract the data, into numpy array:
X_features =df_newh.drop(['median_house_price'],axis=1)
y_actual=df_newh['median_house_price']
In [39]:
X_features.head()
Out[39]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income ocean_prox_INLAND ocean_prox_ISLAND ocean_prox_NEAR BAY ocean_prox_NEAR OCEAN
0 -1.327835 1.052548 0.982143 -0.804819 -0.967245 -0.974429 -0.977033 2.344766 -0.681889 -0.015566 2.830742 -0.384466
1 -1.322844 1.043185 -0.607019 2.045890 1.358707 0.861439 1.669961 2.332238 -0.681889 -0.015566 2.830742 -0.384466
2 -1.332827 1.038503 1.856182 -0.535746 -0.822021 -0.820777 -0.843637 1.782699 -0.681889 -0.015566 2.830742 -0.384466
3 -1.337818 1.038503 1.856182 -0.624215 -0.714889 -0.766028 -0.733781 0.932968 -0.681889 -0.015566 2.830742 -0.384466
4 -1.337818 1.038503 1.856182 -0.462404 -0.607758 -0.759847 -0.629157 -0.012881 -0.681889 -0.015566 2.830742 -0.384466

Correlations between features and target:

In [40]:
features=['longitude','latitude','housing_median_age','total_rooms','total_bedrooms','population','households','median_income','ocean_prox_INLAND','ocean_prox_ISLAND','ocean_prox_NEAR BAY','ocean_prox_NEAR OCEAN']
target=df_newh['median_house_price'].name
correlations = {}
for f in features:
    data_temp = df_newh[[f,target]]
    x1 = data_temp[f].values
    x2 = data_temp[target].values
    key = f + ' vs ' + target
    correlations[key] = pearsonr(x1,x2)[0]
    
In [41]:
data_correlations = pd.DataFrame(correlations, index=['Value']).T
data_correlations.loc[data_correlations['Value'].abs().sort_values(ascending=False).index]
Out[41]:
Value
median_income vs median_house_price 0.688075
ocean_prox_INLAND vs median_house_price -0.484859
ocean_prox_NEAR BAY vs median_house_price 0.160284
latitude vs median_house_price -0.144160
ocean_prox_NEAR OCEAN vs median_house_price 0.141862
total_rooms vs median_house_price 0.134153
housing_median_age vs median_house_price 0.105623
households vs median_house_price 0.065843
total_bedrooms vs median_house_price 0.049406
longitude vs median_house_price -0.045967
population vs median_house_price -0.024650
ocean_prox_ISLAND vs median_house_price 0.023416
In [42]:
X_features.values
y_actual.values
Out[42]:
array([ 2.12963148,  1.31415614,  1.25869341, ..., -0.99274649,
       -1.05860847, -1.01787803])
In [43]:
y = df_newh.loc[:,['median_income','ocean_prox_INLAND',target]].sort_values(target, ascending=True).values
x = np.arange(y.shape[0])

We can see that the top 5 features are the most correlated features with the target "price" Let's plot the best 2 regressors jointly

In [112]:
plt.subplot(3,1,1)
plt.plot(x,y[:,0])
plt.title('median income and ocean_proximity_INLAND vs house price')
plt.ylabel('median_income')
plt.subplot(3,1,2)
plt.plot(x,y[:,1])
plt.ylabel('ocean_prox_INLAND')
plt.subplot(3,1,3)
plt.plot(x,y[:,2],'r')
plt.ylabel("house price")

plt.show()

Feature Selection Processes:

In [113]:
# For each X_features, calculate VIF and save in dataframe
VIF = pd.DataFrame()

VIF["VIF_Factor"] = [variance_inflation_factor(X_features.values, i) for i in range(X_features.shape[1])]

VIF["feature"] = X_features.columns
In [114]:
VIF.sort_values(['VIF_Factor'], ascending=False).round(5)
Out[114]:
VIF_Factor feature
6 28.63783 households
4 27.50360 total_bedrooms
1 19.93028 latitude
0 18.03469 longitude
3 12.37771 total_rooms
5 6.34502 population
8 2.85434 ocean_prox_INLAND
7 1.74293 median_income
10 1.56571 ocean_prox_NEAR BAY
2 1.32192 housing_median_age
11 1.19715 ocean_prox_NEAR OCEAN
9 1.00204 ocean_prox_ISLAND

Lets drop the variables with VIF>3.5:

In [115]:
df_final =df_newh.drop(['population','total_bedrooms','households','total_rooms','longitude','latitude'],axis=1)
In [116]:
df_final.head()
Out[116]:
housing_median_age median_income median_house_price ocean_prox_INLAND ocean_prox_ISLAND ocean_prox_NEAR BAY ocean_prox_NEAR OCEAN
0 0.982143 2.344766 2.129631 -0.681889 -0.015566 2.830742 -0.384466
1 -0.607019 2.332238 1.314156 -0.681889 -0.015566 2.830742 -0.384466
2 1.856182 1.782699 1.258693 -0.681889 -0.015566 2.830742 -0.384466
3 1.856182 0.932968 1.165100 -0.681889 -0.015566 2.830742 -0.384466
4 1.856182 -0.012881 1.172900 -0.681889 -0.015566 2.830742 -0.384466
In [117]:
# 2 Again applying feature_importance technique for the further feature selection:
X_train,X_test,y_train,y_test=train_test_split(X_features,y_actual,test_size=0.20,random_state=101)
In [118]:
print('Training Features Shape:', X_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Labels Shape:', y_test.shape)
Training Features Shape: (16512, 12)
Training Labels Shape: (16512,)
Testing Features Shape: (4128, 12)
Testing Labels Shape: (4128,)
In [119]:
df_final.shape
Out[119]:
(20640, 7)

Visualizing Outliers through Boxplots:

In [139]:
#Checking Outliers: 
plt.figure(figsize=(25,15))
boxplot=df_final.boxplot(patch_artist=True)

Detecting and Removing Outliers:

In [140]:
# Calculation of IQR:
Q1=df_final.quantile(0.25)
Q3=df_final.quantile(0.75)
IQR=Q3-Q1
print(IQR)
housing_median_age       1.509703
median_income            1.147425
median_house_price       1.257661
ocean_prox_INLAND        2.148403
ocean_prox_ISLAND        0.000000
ocean_prox_NEAR BAY      0.000000
ocean_prox_NEAR OCEAN    0.000000
dtype: float64
In [141]:
# IQR for 'median_income', detecting outlier with IQR, TRUE=outliers, FALSE=valid data
print(df_final<(Q1-1.5*IQR))
(df_final>(Q3+1.5*IQR))
       housing_median_age  median_income  median_house_price  \
0                   False          False               False   
1                   False          False               False   
2                   False          False               False   
3                   False          False               False   
4                   False          False               False   
5                   False          False               False   
6                   False          False               False   
7                   False          False               False   
8                   False          False               False   
9                   False          False               False   
10                  False          False               False   
11                  False          False               False   
12                  False          False               False   
13                  False          False               False   
14                  False          False               False   
15                  False          False               False   
16                  False          False               False   
17                  False          False               False   
18                  False          False               False   
19                  False          False               False   
20                  False          False               False   
21                  False          False               False   
22                  False          False               False   
23                  False          False               False   
24                  False          False               False   
25                  False          False               False   
26                  False          False               False   
27                  False          False               False   
28                  False          False               False   
29                  False          False               False   
...                   ...            ...                 ...   
20610               False          False               False   
20611               False          False               False   
20612               False          False               False   
20613               False          False               False   
20614               False          False               False   
20615               False          False               False   
20616               False          False               False   
20617               False          False               False   
20618               False          False               False   
20619               False          False               False   
20620               False          False               False   
20621               False          False               False   
20622               False          False               False   
20623               False          False               False   
20624               False          False               False   
20625               False          False               False   
20626               False          False               False   
20627               False          False               False   
20628               False          False               False   
20629               False          False               False   
20630               False          False               False   
20631               False          False               False   
20632               False          False               False   
20633               False          False               False   
20634               False          False               False   
20635               False          False               False   
20636               False          False               False   
20637               False          False               False   
20638               False          False               False   
20639               False          False               False   

       ocean_prox_INLAND  ocean_prox_ISLAND  ocean_prox_NEAR BAY  \
0                  False              False                False   
1                  False              False                False   
2                  False              False                False   
3                  False              False                False   
4                  False              False                False   
5                  False              False                False   
6                  False              False                False   
7                  False              False                False   
8                  False              False                False   
9                  False              False                False   
10                 False              False                False   
11                 False              False                False   
12                 False              False                False   
13                 False              False                False   
14                 False              False                False   
15                 False              False                False   
16                 False              False                False   
17                 False              False                False   
18                 False              False                False   
19                 False              False                False   
20                 False              False                False   
21                 False              False                False   
22                 False              False                False   
23                 False              False                False   
24                 False              False                False   
25                 False              False                False   
26                 False              False                False   
27                 False              False                False   
28                 False              False                False   
29                 False              False                False   
...                  ...                ...                  ...   
20610              False              False                False   
20611              False              False                False   
20612              False              False                False   
20613              False              False                False   
20614              False              False                False   
20615              False              False                False   
20616              False              False                False   
20617              False              False                False   
20618              False              False                False   
20619              False              False                False   
20620              False              False                False   
20621              False              False                False   
20622              False              False                False   
20623              False              False                False   
20624              False              False                False   
20625              False              False                False   
20626              False              False                False   
20627              False              False                False   
20628              False              False                False   
20629              False              False                False   
20630              False              False                False   
20631              False              False                False   
20632              False              False                False   
20633              False              False                False   
20634              False              False                False   
20635              False              False                False   
20636              False              False                False   
20637              False              False                False   
20638              False              False                False   
20639              False              False                False   

       ocean_prox_NEAR OCEAN  
0                      False  
1                      False  
2                      False  
3                      False  
4                      False  
5                      False  
6                      False  
7                      False  
8                      False  
9                      False  
10                     False  
11                     False  
12                     False  
13                     False  
14                     False  
15                     False  
16                     False  
17                     False  
18                     False  
19                     False  
20                     False  
21                     False  
22                     False  
23                     False  
24                     False  
25                     False  
26                     False  
27                     False  
28                     False  
29                     False  
...                      ...  
20610                  False  
20611                  False  
20612                  False  
20613                  False  
20614                  False  
20615                  False  
20616                  False  
20617                  False  
20618                  False  
20619                  False  
20620                  False  
20621                  False  
20622                  False  
20623                  False  
20624                  False  
20625                  False  
20626                  False  
20627                  False  
20628                  False  
20629                  False  
20630                  False  
20631                  False  
20632                  False  
20633                  False  
20634                  False  
20635                  False  
20636                  False  
20637                  False  
20638                  False  
20639                  False  

[20640 rows x 7 columns]
Out[141]:
housing_median_age median_income median_house_price ocean_prox_INLAND ocean_prox_ISLAND ocean_prox_NEAR BAY ocean_prox_NEAR OCEAN
0 False True False False False True False
1 False True False False False True False
2 False False False False False True False
3 False False False False False True False
4 False False False False False True False
5 False False False False False True False
6 False False False False False True False
7 False False False False False True False
8 False False False False False True False
9 False False False False False True False
10 False False False False False True False
11 False False False False False True False
12 False False False False False True False
13 False False False False False True False
14 False False False False False True False
15 False False False False False True False
16 False False False False False True False
17 False False False False False True False
18 False False False False False True False
19 False False False False False True False
20 False False False False False True False
21 False False False False False True False
22 False False False False False True False
23 False False False False False True False
24 False False False False False True False
25 False False False False False True False
26 False False False False False True False
27 False False False False False True False
28 False False False False False True False
29 False False False False False True False
... ... ... ... ... ... ... ...
20610 False False False False False False False
20611 False False False False False False False
20612 False False False False False False False
20613 False False False False False False False
20614 False False False False False False False
20615 False False False False False False False
20616 False False False False False False False
20617 False False False False False False False
20618 False False False False False False False
20619 False False False False False False False
20620 False False False False False False False
20621 False False False False False False False
20622 False False False False False False False
20623 False False False False False False False
20624 False False False False False False False
20625 False False False False False False False
20626 False False False False False False False
20627 False False False False False False False
20628 False False False False False False False
20629 False False False False False False False
20630 False False False False False False False
20631 False False False False False False False
20632 False False False False False False False
20633 False False False False False False False
20634 False False False False False False False
20635 False False False False False False False
20636 False False False False False False False
20637 False False False False False False False
20638 False False False False False False False
20639 False False False False False False False

20640 rows × 7 columns

In [142]:
# Removing Outliers:
df_out = df_final[~((df_final < (Q1 - 1.5 * IQR)) |(df_final > (Q3 + 1.5 * IQR))).any(axis=1)]
df_out.shape
Out[142]:
(14887, 7)
In [143]:
df_out.head()
Out[143]:
housing_median_age median_income median_house_price ocean_prox_INLAND ocean_prox_ISLAND ocean_prox_NEAR BAY ocean_prox_NEAR OCEAN
701 0.267020 1.151046 1.942445 -0.681889 -0.015566 -0.353264 -0.384466
830 -1.560516 0.408011 0.087910 -0.681889 -0.015566 -0.353264 -0.384466
859 -0.607019 0.391588 0.353091 -0.681889 -0.015566 -0.353264 -0.384466
860 -1.083767 0.973025 0.664202 -0.681889 -0.015566 -0.353264 -0.384466
861 -0.686477 -0.098681 0.087043 -0.681889 -0.015566 -0.353264 -0.384466
In [144]:
df_final.shape
Out[144]:
(20640, 7)

Visualizations after Removal Outliers:

In [145]:
# Lets see the visualization one more time:
# visualize the relationship between the features and the response using scatterplots
sns.pairplot(df_out, diag_kind='kde',plot_kws={'alpha':0.6,'s':80,'edgecolor':'k'},size=4) 
Out[145]:
<seaborn.axisgrid.PairGrid at 0x28ba8eb6e48>
In [146]:
#Checking Outliers: 
plt.figure(figsize=(25,15))
boxplot=df_out.boxplot(patch_artist=True)
In [147]:
# extract further remaining data into numpy array:
X_out =df_out.drop(['median_house_price'],axis=1)
y_actual=df_out['median_house_price']
In [165]:
X_out.head()
Out[165]:
housing_median_age median_income ocean_prox_INLAND ocean_prox_ISLAND ocean_prox_NEAR BAY ocean_prox_NEAR OCEAN
701 0.267020 1.151046 -0.681889 -0.015566 -0.353264 -0.384466
830 -1.560516 0.408011 -0.681889 -0.015566 -0.353264 -0.384466
859 -0.607019 0.391588 -0.681889 -0.015566 -0.353264 -0.384466
860 -1.083767 0.973025 -0.681889 -0.015566 -0.353264 -0.384466
861 -0.686477 -0.098681 -0.681889 -0.015566 -0.353264 -0.384466
In [148]:
# Splitting data:
X_train,X_test,y_train,y_test=train_test_split(X_out,y_actual,test_size=0.20,random_state=101)
In [149]:
print('Training Features Shape:', X_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Labels Shape:', y_test.shape)
Training Features Shape: (11909, 6)
Training Labels Shape: (11909,)
Testing Features Shape: (2978, 6)
Testing Labels Shape: (2978,)
In [150]:
# New model without Outliers:
lin = LinearRegression()

# fit the model to the training data (learn the coefficients)
lin.fit(X_train, y_train)
Out[150]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
In [151]:
# pair the feature names with the coefficients
list(zip(X_train, lin.coef_))
Out[151]:
[('housing_median_age', 0.04528036251616816),
 ('median_income', 0.5537966379760776),
 ('ocean_prox_INLAND', -0.2868217450990511),
 ('ocean_prox_ISLAND', 0.0),
 ('ocean_prox_NEAR BAY', 1.1677900694518508e-32),
 ('ocean_prox_NEAR OCEAN', 1.1677900694518508e-32)]
In [152]:
# make predictions on the testing set
y_pred = lin.predict(X_test)
print(y_pred)
[ 0.26632717 -0.97822624 -0.73475241 ... -0.48906919 -0.01009481
  0.14322925]
In [83]:
# Fitting Polynomial Regression to the dataset 
poly = PolynomialFeatures(degree = 2) 
X_poly = poly.fit_transform(X_train) 
  
poly.fit(X_poly, y_train) 

lin2 = LinearRegression() 
lin2.fit(X_poly, y_train)
Out[83]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
In [84]:
list(zip(X_train, lin2.coef_))
Out[84]:
[('housing_median_age', -1391.721226094419),
 ('median_income', 4616283.400233289),
 ('ocean_prox_INLAND', 64941.34080340499),
 ('ocean_prox_ISLAND', -46704415.741805695),
 ('ocean_prox_NEAR BAY', 1644509.8551970534),
 ('ocean_prox_NEAR OCEAN', 39254909.80430899)]
In [85]:
X_train.shape
Out[85]:
(11909, 6)
In [86]:
X_test.shape
Out[86]:
(2978, 6)
In [87]:
poly = PolynomialFeatures(degree = 3) 
X_poly = poly.fit_transform(X_train) 
  
poly.fit(X_poly, y_train) 

lin3 = LinearRegression() 
lin3.fit(X_poly, y_train)
Out[87]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
In [88]:
list(zip(X_train, lin3.coef_))
Out[88]:
[('housing_median_age', 104.80319105545377),
 ('median_income', -231611.52117824426),
 ('ocean_prox_INLAND', -20225.214057437643),
 ('ocean_prox_ISLAND', 3398608.545076937),
 ('ocean_prox_NEAR BAY', -60223.597297978056),
 ('ocean_prox_NEAR OCEAN', -1517072.3650305048)]
In [89]:
errors = abs(y_pred - y_test)
In [90]:
# Display the performance metrics
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')
mape = np.mean(100 * (errors / y_test))
accuracy = 100 - mape
print('Accuracy:', round(accuracy, 2), '%.')
Mean Absolute Error: 0.36 degrees.
Accuracy: 96.62 %.
In [95]:
# Model evaluation metrics for regression(degree=1)

print('y-intercept             : ', lin.intercept_)
print('beta coefficients       : ', lin.coef_)
print('Mean Abs Error   MAE    : ', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Sq  Error MSE      : ', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Sq Error RMSE : ', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('r2 value                : ', metrics.r2_score(y_test, y_pred))
y-intercept             :  -0.0922982758098485
beta coefficients       :  [ 4.52803625e-02  5.53796638e-01 -2.86821745e-01  0.00000000e+00
  1.16779007e-32  1.16779007e-32]
Mean Abs Error   MAE    :  0.3616544301540793
Mean Sq  Error MSE      :  0.23901782341524358
Root Mean Sq Error RMSE :  0.4888944910870275
r2 value                :  0.5770579553401091
In [96]:
X_train = sm.add_constant(X_train) ## let's add an intercept (beta_0) to our model

X_test    = sm.add_constant(X_test)

lm3 = sm.OLS(y_train,X_train).fit()
lm3.summary()
Out[96]:
OLS Regression Results
Dep. Variable: median_house_price R-squared: 0.574
Model: OLS Adj. R-squared: 0.574
Method: Least Squares F-statistic: 5351.
Date: Tue, 07 May 2019 Prob (F-statistic): 0.00
Time: 22:17:10 Log-Likelihood: -8720.6
No. Observations: 11909 AIC: 1.745e+04
Df Residuals: 11905 BIC: 1.748e+04
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
housing_median_age 0.0453 0.005 8.748 0.000 0.035 0.055
median_income 0.5538 0.007 85.153 0.000 0.541 0.567
ocean_prox_INLAND -0.2868 0.005 -61.361 0.000 -0.296 -0.278
ocean_prox_ISLAND 0.0053 0.000 19.102 0.000 0.005 0.006
ocean_prox_NEAR BAY 0.1195 0.006 19.102 0.000 0.107 0.132
ocean_prox_NEAR OCEAN 0.1301 0.007 19.102 0.000 0.117 0.143
Omnibus: 3126.417 Durbin-Watson: 2.005
Prob(Omnibus): 0.000 Jarque-Bera (JB): 8911.725
Skew: 1.380 Prob(JB): 0.00
Kurtosis: 6.216 Cond. No. 2.22e+17


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 3.41e-31. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
In [97]:
# fitted values (need a constant term for intercept)
model_fitted_y = lm3.fittedvalues

# model residuals
model_residuals = lm3.resid

# normalized residuals
model_norm_residuals = lm3.get_influence().resid_studentized_internal

# absolute squared normalized residuals
model_norm_residuals_abs_sqrt = np.sqrt(np.abs(model_norm_residuals))

# absolute residuals
model_abs_resid = np.abs(model_residuals)

# leverage, from statsmodels internals
model_leverage = lm3.get_influence().hat_matrix_diag

# cook's distance, from statsmodels internals
model_cooks = lm3.get_influence().cooks_distance[0]

Diagnosing Homoscedasticity

In [100]:
# Checking the presence of heteroscedasticity:
name = ['Lagrange multiplier statistic', 'p-value', 
        'f-value', 'f p-value']

bp = statsmodels.stats.diagnostic.het_breuschpagan(lm3.resid, lm3.model.exog)
bp

pd.DataFrame(name, bp)
Out[100]:
0
3.053173e+02 Lagrange multiplier statistic
7.201150e-64 p-value
1.044152e+02 f-value
1.010939e-66 f p-value

Tests are significant meaning that data violates the assumption of homoscedasticity, i.e. heteroscedasticity is present in the data.In another word, Since our p-value is less than 0.05, this indicates that heteroscedasticity is present, and we reject the null hypothesis of homoscedasticity.

Diagnosing Normality:

In [101]:
stats.probplot(lm3.resid, plot= plt)
plt.title("Model1 Residuals Probability Plot");

the residuals which will be represented as dots (in blue) should fall on the red line. This plot indicates that the model’s residuals are not normally distributed.

Kolmogorov-Smirnov test (for normality)

In [102]:
stats.kstest(lm3.resid, 'norm')
Out[102]:
KstestResult(statistic=0.21429823787621488, pvalue=0.0)

The test is significant which indicates that the model’s residuals are not normally distributed. REJECT the null hypothesis (that the residuals are normally distributed).

Bonus exercise:

Perform Linear Regression with one independent variable : • Extract just the median_income column from the independent variables (from X_train and X_test). • Perform Linear Regression to predict housing values based on median_income. • Predict output for test dataset using the fitted model. • Plot the fitted model for training data as well as for test data to check if the fitted model satisfies the test data.

In [105]:
df_house.head()
Out[105]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income ocean_proximity median_house_value
0 -122.23 37.88 41 880 129.0 322 126 8.3252 NEAR BAY 452600
1 -122.22 37.86 21 7099 1106.0 2401 1138 8.3014 NEAR BAY 358500
2 -122.24 37.85 52 1467 190.0 496 177 7.2574 NEAR BAY 352100
3 -122.25 37.85 52 1274 235.0 558 219 5.6431 NEAR BAY 341300
4 -122.25 37.85 52 1627 280.0 565 259 3.8462 NEAR BAY 342200
In [175]:
df_out.head()
Out[175]:
housing_median_age median_income median_house_price ocean_prox_INLAND ocean_prox_ISLAND ocean_prox_NEAR BAY ocean_prox_NEAR OCEAN
701 0.267020 1.151046 1.942445 -0.681889 -0.015566 -0.353264 -0.384466
830 -1.560516 0.408011 0.087910 -0.681889 -0.015566 -0.353264 -0.384466
859 -0.607019 0.391588 0.353091 -0.681889 -0.015566 -0.353264 -0.384466
860 -1.083767 0.973025 0.664202 -0.681889 -0.015566 -0.353264 -0.384466
861 -0.686477 -0.098681 0.087043 -0.681889 -0.015566 -0.353264 -0.384466
In [166]:
X_medin =df_out.drop(['median_house_price','housing_median_age','ocean_prox_INLAND','ocean_prox_ISLAND','ocean_prox_NEAR BAY','ocean_prox_NEAR OCEAN'],axis=1)
y_actual=df_out['median_house_price']
In [167]:
X_medin.head()
Out[167]:
median_income
701 1.151046
830 0.408011
859 0.391588
860 0.973025
861 -0.098681
In [168]:
print(X_medin.shape)
print(y_actual.shape)
(14887, 1)
(14887,)
In [169]:
# Splitting X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_medin, y_actual, random_state=1, test_size=0.2)
In [170]:
print('Training Features Shape:', X_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Labels Shape:', y_test.shape)
Training Features Shape: (11909, 1)
Training Labels Shape: (11909,)
Testing Features Shape: (2978, 1)
Testing Labels Shape: (2978,)
In [171]:
plt.rcParams['figure.figsize'] = (8, 6)
plt.rcParams['font.size'] = 14

# Pandas scatter plot
df_out.plot(kind='scatter', x='median_income', y='median_house_price', alpha=0.2)
Out[171]:
<matplotlib.axes._subplots.AxesSubplot at 0x28ba74800f0>
In [172]:
# Seaborn scatter plot with regression line
sns.lmplot(x='median_income', y='median_house_price', data=df_out, aspect=1.5, scatter_kws={'alpha':0.2})
Out[172]:
<seaborn.axisgrid.FacetGrid at 0x28ba74c9748>
In [173]:
linreg = LinearRegression()

# fit the model to the training data (learn the coefficients)
linreg.fit(X_train, y_train)
Out[173]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
In [174]:
# print the coefficients
print(linreg.intercept_)
print(linreg.coef_)
-0.15637811579810473
[0.65075579]
In [175]:
list(zip(X_train, linreg.coef_))
Out[175]:
[('median_income', 0.6507557887711789)]
In [176]:
# OR:
# visualize the relationship between the features and the response using scatterplots

plt.scatter(X_train, y_train, color = 'blue') 
  
plt.plot(X_test, linreg.predict(X_test), color = 'red') 
plt.title('Linear Regression') 
plt.xlabel('median_income') 
plt.ylabel('median_house_price') 
Out[176]:
Text(0,0.5,'median_house_price')
In [177]:
from sklearn.preprocessing import PolynomialFeatures
poly  = PolynomialFeatures(degree=2)
poly_X= poly.fit_transform(X_train)
In [178]:
linreg2 = LinearRegression()
linreg2.fit(poly_X, y_train)
Out[178]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
In [179]:
print(linreg2.intercept_)
print(linreg2.coef_)
-0.18327692063053574
[0.         0.64539124 0.04279762]
In [180]:
list(zip(X_train, linreg2.coef_))
Out[180]:
[('median_income', 0.0)]
In [181]:
plt.scatter(X_train,y_train,color='red')
plt.plot(X_train, linreg2.predict(poly.fit_transform(X_train)), color='blue')
plt.show()
In [182]:
from sklearn.preprocessing import PolynomialFeatures
poly  = PolynomialFeatures(degree=3)
poly_X= poly.fit_transform(X_train)
In [183]:
linreg3 = LinearRegression()
linreg3.fit(poly_X, y_train)
Out[183]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
In [184]:
print(linreg3.intercept_)
print(linreg3.coef_)
-0.183935663248267
[ 0.          0.6627552   0.04797368 -0.01173558]
In [223]:
list(zip(X_train, linreg3.coef_))
Out[223]:
[('median_income', 0.0)]
In [186]:
plt.scatter(X_train,y_train,color='red')
plt.plot(X_train, linreg3.predict(poly.fit_transform(X_train)), color='blue')
plt.show()
In [187]:
from sklearn.preprocessing import PolynomialFeatures
poly  = PolynomialFeatures(degree=4)
poly_X= poly.fit_transform(X_train)
In [188]:
linreg4 = LinearRegression()
linreg4.fit(poly_X, y_train)
Out[188]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
In [189]:
print(linreg4.intercept_)
print(linreg4.coef_)
-0.16803352141068423
[ 0.          0.67794567 -0.01389087 -0.03044438  0.02639198]
In [190]:
list(zip(X_train, linreg4.coef_))
Out[190]:
[('median_income', 0.0)]
In [191]:
plt.scatter(X_train,y_train,color='red')
plt.plot(X_train, linreg4.predict(poly.fit_transform(X_train)), color='blue')
plt.show()
In [192]:
df_out.corr()
Out[192]:
housing_median_age median_income median_house_price ocean_prox_INLAND ocean_prox_ISLAND ocean_prox_NEAR BAY ocean_prox_NEAR OCEAN
housing_median_age 1.000000e+00 -2.112632e-01 1.993449e-02 -2.072568e-01 -1.480939e-15 -3.230734e-15 2.386826e-15
median_income -2.112632e-01 1.000000e+00 6.448102e-01 -2.555571e-01 -3.788144e-16 2.007935e-16 4.623942e-16
median_house_price 1.993449e-02 6.448102e-01 1.000000e+00 -5.469543e-01 -2.168069e-16 -1.943587e-16 7.088227e-16
ocean_prox_INLAND -2.072568e-01 -2.555571e-01 -5.469543e-01 1.000000e+00 1.307923e-14 1.621530e-14 2.445105e-14
ocean_prox_ISLAND -1.480939e-15 -3.788144e-16 -2.168069e-16 1.307923e-14 1.000000e+00 1.000000e+00 -1.000000e+00
ocean_prox_NEAR BAY -3.230734e-15 2.007935e-16 -1.943587e-16 1.621530e-14 1.000000e+00 1.000000e+00 -1.000000e+00
ocean_prox_NEAR OCEAN 2.386826e-15 4.623942e-16 7.088227e-16 2.445105e-14 -1.000000e+00 -1.000000e+00 1.000000e+00
In [193]:
#Checking Outliers[Outliers have already been detected in first part] 
plt.figure(figsize=(25,15))
boxplot=df_out.boxplot(patch_artist=True)
In [194]:
df_med=df_out.drop(['housing_median_age','ocean_prox_INLAND','ocean_prox_ISLAND','ocean_prox_NEAR BAY','ocean_prox_NEAR OCEAN'],axis=1)
df_med.head()
Out[194]:
median_income median_house_price
701 1.151046 1.942445
830 0.408011 0.087910
859 0.391588 0.353091
860 0.973025 0.664202
861 -0.098681 0.087043
In [195]:
df_med.hist(figsize=(9,6), xlabelsize = 10);
In [196]:
y_pred=linreg.predict(X_test)
In [197]:
y_pred
Out[197]:
array([-0.69505207,  0.621     , -0.36740922, ..., -0.42070899,
       -0.48952598, -0.308526  ])
In [198]:
# Model evaluation metrics for linear regression:

print('y-intercept             : ', linreg.intercept_)
print('beta coefficients       : ', linreg.coef_)
print('Mean Abs Error   MAE    : ', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Sq  Error MSE      : ', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Sq Error RMSE : ', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('r2 value                : ', metrics.r2_score(y_test, y_pred))
y-intercept             :  -0.15637811579810473
beta coefficients       :  [0.65075579]
Mean Abs Error   MAE    :  0.434647471777463
Mean Sq  Error MSE      :  0.329301652789642
Root Mean Sq Error RMSE :  0.5738481095112556
r2 value                :  0.41814070624141386
In [199]:
# Model evaluation metrics for polynomial (4th degree) regression:

print('y-intercept             : ', linreg4.intercept_)
print('beta coefficients       : ', linreg4.coef_)
print('Mean Abs Error   MAE    : ', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Sq  Error MSE      : ', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Sq Error RMSE : ', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('r2 value                : ', metrics.r2_score(y_test, y_pred))
y-intercept             :  -0.16803352141068423
beta coefficients       :  [ 0.          0.67794567 -0.01389087 -0.03044438  0.02639198]
Mean Abs Error   MAE    :  0.434647471777463
Mean Sq  Error MSE      :  0.329301652789642
Root Mean Sq Error RMSE :  0.5738481095112556
r2 value                :  0.41814070624141386
In [200]:
model_f = 'median_house_price ~ median_income'

model = smf.ols(formula=model_f, data=df_med)
model_fit = model.fit()
In [218]:
# fitted values (need a constant term for intercept)
model_fitted_y = model_fit.fittedvalues

# model residuals
model_residuals = model_fit.resid

# normalized residuals
model_norm_residuals = model_fit.get_influence().resid_studentized_internal

# absolute squared normalized residuals
model_norm_residuals_abs_sqrt = np.sqrt(np.abs(model_norm_residuals))

# absolute residuals
model_abs_resid = np.abs(model_residuals)

# leverage, from statsmodels internals
model_leverage = model_fit.get_influence().hat_matrix_diag

# cook's distance, from statsmodels internals
model_cooks = model_fit.get_influence().cooks_distance[0]
In [221]:
plot_lm_1 = plt.figure(1)
plot_lm_1.set_figheight(6)
plot_lm_1.set_figwidth(9)

plot_lm_1.axes[0] = sns.residplot(model_fitted_y, 'median_house_price', data=df_med, 
                          lowess=True, 
                          scatter_kws={'alpha': 0.5}, 
                          line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})

plot_lm_1.axes[0].set_title('Residuals vs Fitted')
plot_lm_1.axes[0].set_xlabel('Fitted values')
plot_lm_1.axes[0].set_ylabel('Residuals')

# annotations
abs_resid = model_abs_resid.sort_values(ascending=False)
abs_resid_top_3 = abs_resid[:3]

for i in abs_resid_top_3.index:
    plot_lm_1.axes[0].annotate(i, 
                               xy=(model_fitted_y[i], 
                                   model_residuals[i]));
In [222]:
# Plot outputs (training_data vs test_data)
plt.scatter(X_test, y_test,  color='black')
plt.plot(X_test, y_pred, color='blue', linewidth=3)

plt.xticks(())
plt.yticks(())

plt.show()
In [223]:
plt.scatter(X_train, y_train,  color='black')
plt.plot(X_test, y_pred, color='blue', linewidth=3)

plt.xticks(())
plt.yticks(())

plt.show()
In [224]:
X_train = sma.add_constant(X_train) ## let's add an intercept (beta_0) to our model

X_test    = sma.add_constant(X_test)

linreg = sm.OLS(y_train,X_train).fit()
linreg.summary()
Out[224]:
OLS Regression Results
Dep. Variable: median_house_price R-squared: 0.415
Model: OLS Adj. R-squared: 0.415
Method: Least Squares F-statistic: 8454.
Date: Wed, 08 May 2019 Prob (F-statistic): 0.00
Time: 23:06:19 Log-Likelihood: -10608.
No. Observations: 11909 AIC: 2.122e+04
Df Residuals: 11907 BIC: 2.123e+04
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const -0.1564 0.006 -28.325 0.000 -0.167 -0.146
median_income 0.6508 0.007 91.945 0.000 0.637 0.665
Omnibus: 2444.478 Durbin-Watson: 1.986
Prob(Omnibus): 0.000 Jarque-Bera (JB): 5371.740
Skew: 1.184 Prob(JB): 0.00
Kurtosis: 5.285 Cond. No. 1.38


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

R2 is surprisingly decresed after removing outliers, I will check it in further DT and RF classifier. Hopefully, Bagging and Bagstrapping in RF will help to enhance the model.

In [163]:
df_out.head()
Out[163]:
housing_median_age median_income median_house_price ocean_prox_INLAND ocean_prox_ISLAND ocean_prox_NEAR BAY ocean_prox_NEAR OCEAN
701 0.267020 1.151046 1.942445 -0.681889 -0.015566 -0.353264 -0.384466
830 -1.560516 0.408011 0.087910 -0.681889 -0.015566 -0.353264 -0.384466
859 -0.607019 0.391588 0.353091 -0.681889 -0.015566 -0.353264 -0.384466
860 -1.083767 0.973025 0.664202 -0.681889 -0.015566 -0.353264 -0.384466
861 -0.686477 -0.098681 0.087043 -0.681889 -0.015566 -0.353264 -0.384466
In [201]:
print(X_medin.shape)
print(y_actual.shape)
(14887, 1)
(14887,)
In [202]:
# Splitting X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_medin, y_actual, random_state=1, test_size=0.2)
In [221]:
# Very Low the value of r-squared, so Let's try XGboost algorithm to see if we can get better results
xgb = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=3)
In [222]:
xgb.fit(X_train,y_train)
Out[222]:
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, importance_type='gain',
       learning_rate=0.08, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=0.75)
In [219]:
predictions = xgb.predict(X_test)
print(explained_variance_score(predictions,y_test))
-0.2722108664683527

Not that much significant, may be I need to find out the best max_depth value and number of estimators in RF:

In [249]:
# instantiate the XGBregressor with different parameters:
xg_reg = XGBRegressor(objective ='reg:linear', 
                          colsample_bytree = 0.3, 
                          learning_rate = 0.1,
                          max_depth = 5, 
                          alpha = 10, 
                          n_estimators = 10)
In [250]:
xg_reg.fit(X_train,y_train)
Out[250]:
XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.3, gamma=0, importance_type='gain',
       learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=10, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)
In [251]:
preds = xg_reg.predict(X_test)
In [252]:
print(explained_variance_score(preds,y_test))
-2.286250538199126
In [253]:
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))
RMSE: 0.650835

Conclusion:

median_income is most important factor for the prediction of the houses, but we need to include other more variables in order to improve the model.

------END------